NSTI_for_KL has all the demographic, cost, and insurance data.


“Total Cost” is the outcome we are interested in.

Goal is to determine whether there are characteristics that predict who will cost the most (top quartile).

NSTI_meds_for_KL has all the medications: which patients got which antibiotics and for how long, and when they were started, changed, and stopped in relation to admission and the last OR visit.

NSTI_debride_for_KL gives list of OR visits

Variables(nsti: 78 variables)

Numerical

Categorical

Binary

Questions: Why are some features not recorded for all patients?

# Preview the first rows of nsti, then print a per-column summary.
head(nsti)
summary(nsti)
##        V1          Patient           Discharge              AGE       
##  Min.   :  1.0   Length:432         Length:432         Min.   :10.00  
##  1st Qu.:108.8   Class :character   Class :character   1st Qu.:43.00  
##  Median :216.5   Mode  :character   Mode  :character   Median :55.00  
##  Mean   :216.5                                         Mean   :52.72  
##  3rd Qu.:324.2                                         3rd Qu.:62.00  
##  Max.   :432.0                                         Max.   :93.00  
##                                                                       
##      SEX                RACE             OUTCOME         
##  Length:432         Length:432         Length:432        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    Admit.x             ICU.LOS         ICU.HOURS         HOSP.LOS     
##  Length:432         Min.   : 1.000   Min.   :   0.0   Min.   :  0.00  
##  Class :character   1st Qu.: 2.250   1st Qu.:  45.0   1st Qu.: 10.00  
##  Mode  :character   Median : 4.000   Median :  89.5   Median : 18.00  
##                     Mean   : 7.042   Mean   : 149.4   Mean   : 23.77  
##                     3rd Qu.: 8.000   3rd Qu.: 169.2   3rd Qu.: 30.00  
##                     Max.   :68.000   Max.   :1642.0   Max.   :317.00  
##                     NA's   :54                                        
##    VENT.DAYS        DC.DISPO         MECHANISM.OF.INFECTION
##  Min.   : 0.000   Length:432         Length:432            
##  1st Qu.: 0.000   Class :character   Class :character      
##  Median : 2.000   Mode  :character   Mode  :character      
##  Mean   : 3.104                                            
##  3rd Qu.: 4.000                                            
##  Max.   :43.000                                            
##                                                            
##    TRANSFER         CoMorbid.Codes      CO.MORBIDS       
##  Length:432         Length:432         Length:432        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  LOCATION.CODE.1    LOCATION.CODE.2    ICD.10.CODE..1    
##  Length:432         Length:432         Length:432        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  ICD.10..1.DESCRIPTION ICD.10.CODE..2     ICD10..2.DESCRITION
##  Length:432            Length:432         Length:432         
##  Class :character      Class :character   Class :character   
##  Mode  :character      Mode  :character   Mode  :character   
##                                                              
##                                                              
##                                                              
##                                                              
##     REGION          Debridement.at.REF DEBRIDE.COUNT         ARDS        
##  Length:432         Min.   :0.0000     Min.   : 0.000   Min.   :0.00000  
##  Class :character   1st Qu.:0.0000     1st Qu.: 2.000   1st Qu.:0.00000  
##  Mode  :character   Median :0.0000     Median : 3.000   Median :0.00000  
##                     Mean   :0.3519     Mean   : 3.106   Mean   :0.03704  
##                     3rd Qu.:1.0000     3rd Qu.: 4.000   3rd Qu.:0.00000  
##                     Max.   :1.0000     Max.   :10.000   Max.   :1.00000  
##                                                                          
##       ARF              CAUTI             CDIFF             CLABSI        
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.000000  
##  Mean   :0.08333   Mean   :0.00463   Mean   :0.03009   Mean   :0.002315  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.000000  
##                                                                          
##       CVA                CPR              DECUB        DVT        
##  Min.   :0.000000   Min.   :0.00000   Min.   :0   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0   1st Qu.:0.0000  
##  Median :0.000000   Median :0.00000   Median :0   Median :0.0000  
##  Mean   :0.009259   Mean   :0.02778   Mean   :0   Mean   :0.0162  
##  3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0   3rd Qu.:0.0000  
##  Max.   :1.000000   Max.   :1.00000   Max.   :0   Max.   :1.0000  
##                                                                   
##  UNPLANNED.ETT           MI                 PE           UNPLANNED.OR     
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :0.00000   Median :0.000000   Median :0.00000   Median :0.000000  
##  Mean   :0.02315   Mean   :0.006944   Mean   :0.01389   Mean   :0.009259  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.00000   Max.   :1.000000  
##                                                                           
##      SEPSIS              VAP             WITHDRAWAL    BLEEDING       
##  Min.   :0.000000   Min.   :0.000000   Min.   :0    Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0    1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0    Median :0.000000  
##  Mean   :0.006944   Mean   :0.002315   Mean   :0    Mean   :0.002315  
##  3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0    3rd Qu.:0.000000  
##  Max.   :1.000000   Max.   :1.000000   Max.   :0    Max.   :1.000000  
##                                                                       
##       CHF               CIRR               CRF              CVA.1        
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.000000   Median :0.00000   Median :0.00000  
##  Mean   :0.05093   Mean   :0.009259   Mean   :0.08102   Mean   :0.01389  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.000000   Max.   :1.00000   Max.   :1.00000  
##                                                                          
##        DM            DEMENTIA          DRUG.ABUSE       ETOH.ABUSE    
##  Min.   :0.0000   Min.   :0.000000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.000000   Median :0.0000   Median :0.0000  
##  Mean   :0.4444   Mean   :0.002315   Mean   :0.1806   Mean   :0.0463  
##  3rd Qu.:1.0000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000000   Max.   :1.0000   Max.   :1.0000  
##                                                                       
##       HTN              MI.1              RESP            SMOKER      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.4259   Mean   :0.01157   Mean   :0.0625   Mean   :0.1968  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##                                                                      
##    Admit.y          Direct.Costs.Variance.Days  Direct.Costs   
##  Length:432         Min.   : 1403              Min.   :  2930  
##  Class :character   1st Qu.: 2940              1st Qu.: 25259  
##  Mode  :character   Median : 5447              Median : 40420  
##                     Mean   :12200              Mean   : 55591  
##                     3rd Qu.:14228              3rd Qu.: 68071  
##                     Max.   :98601              Max.   :385628  
##                     NA's   :397                                
##  Indirect.Costs    Total.Costs     Direct.Cost.Index PB.HMC.Direct.Costs
##  Min.   :  2365   Min.   :  5295   Min.   :0.0000    Min.   :    0      
##  1st Qu.: 19070   1st Qu.: 43944   1st Qu.:0.5500    1st Qu.:  379      
##  Median : 30431   Median : 71043   Median :0.8500    Median : 1184      
##  Mean   : 39763   Mean   : 95354   Mean   :0.9502    Mean   : 1504      
##  3rd Qu.: 50534   3rd Qu.:119026   3rd Qu.:1.2300    3rd Qu.: 2029      
##  Max.   :291884   Max.   :649590   Max.   :3.7100    Max.   :17031      
##                                                      NA's   :138        
##  PB.HMC.Indirect.Costs PB.HMC.Total.Costs PB.UWP.Gross.Charges
##  Min.   :   0.0        Min.   :    0      Min.   :     0      
##  1st Qu.: 231.2        1st Qu.:  669      1st Qu.:  9718      
##  Median : 547.5        Median : 1730      Median : 16166      
##  Mean   : 681.1        Mean   : 2185      Mean   : 20777      
##  3rd Qu.: 912.2        3rd Qu.: 2942      3rd Qu.: 25819      
##  Max.   :6884.0        Max.   :23915      Max.   :150769      
##  NA's   :138           NA's   :138        NA's   :138         
##  PB.UWP.Total.Cost.Allocation     Death           isFemale     
##  Min.   :     0               Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:  7288               1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 12124               Median :0.0000   Median :0.0000  
##  Mean   : 15583               Mean   :0.1412   Mean   :0.3565  
##  3rd Qu.: 19364               3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :113077               Max.   :1.0000   Max.   :1.0000  
##  NA's   :138                                                   
##  Highest_Quartile_Cost    Transfer      Vent.Free.Days     Insure         
##  Min.   :0.00          Min.   :0.0000   Min.   : 0.00   Length:432        
##  1st Qu.:0.00          1st Qu.:1.0000   1st Qu.:24.00   Class :character  
##  Median :0.00          Median :1.0000   Median :26.00   Mode  :character  
##  Mean   :0.25          Mean   :0.8935   Mean   :24.97                     
##  3rd Qu.:0.25          3rd Qu.:1.0000   3rd Qu.:28.00                     
##  Max.   :1.00          Max.   :1.0000   Max.   :28.00                     
##                                                                           
##  Insure_type          Amputation      Public_Insur   Private_Insur   
##  Length:432         Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000   Median :1.000   Median :0.0000  
##                     Mean   :0.1065   Mean   :0.713   Mean   :0.2315  
##                     3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :1.0000   Max.   :1.000   Max.   :1.0000  
##                                                                      
##    Uninsured           Other        
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000  
##  Mean   :0.02315   Mean   :0.03241  
##  3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000  
## 

Things I need to fix: SEX should be binary; Discharge and Admit.x/Admit.y are stored as character rather than dates; Death, isFemale, and Transfer are treated as numerical instead of binary.

NA: some features use empty strings for missing values, while others use NA; the NAs usually start when we get to the PB cost columns.

# Drop the duplicate transfer indicator along with OUTCOME and isFemale
nsti <- subset(nsti, select = -c(TRANSFER, OUTCOME, isFemale))

# Parse the character date columns (month/day/year) into Date objects
dates <- c("Discharge", "Admit.x", "Admit.y")
nsti[dates] <- lapply(nsti[dates], as.Date, format = "%m/%d/%Y")

# A RACE value of "NOT" becomes NA
nsti$RACE[which(nsti$RACE == "NOT")] <- NA

# Categorical features are stored as factors
cols <- c(
  "SEX", "RACE", "DC.DISPO", "MECHANISM.OF.INFECTION",
  "LOCATION.CODE.1", "LOCATION.CODE.2", "ICD.10..1.DESCRIPTION", "REGION",
  "Insure_type", "ICD.10.CODE..1"
)
nsti[cols] <- lapply(nsti[cols], function(column) as.factor(column))

# Semicolon-delimited features are split into list columns
nsti["CoMorbid.Codes"] <- lapply(nsti["CoMorbid.Codes"], strsplit, split = ";")
nsti["Insure"] <- lapply(nsti["Insure"], strsplit, split = ";")
# Keep the Patient string up to (but excluding) its last five characters
nsti["Patient"] <- lapply(nsti["Patient"], str_sub, start = 1, end = -6)

Region is a weird case in which a patient can have more than one region. Not sure how we want to address this.

Issue with patient 179*NSTI or V1 (70) which has a co.morbids value of asthma

# Show the full row for the record with V1 == 70.
nsti[nsti$V1 == 70,]

Debrid: converting the date features that were stored as character to actual Date types, and converting the features that should be factors to factors.

# Parse the character date columns (month/day/year) into Date objects
dates <- c(
  "ADMIT.DATE", "DEBRIDE.1.START.DATE", "DEB.2.DATE", "DEB3.DATE", "DEB4.DATE",
  "REC1.DATE", "REC2.DATE", "REC3.DATE", "REC4.DATE", "DC.Date"
)
nsti_debrid[dates] <- lapply(nsti_debrid[dates], as.Date, format = "%m/%d/%Y")

# Store the listed code/flag columns as factors
cols <- c(
  "DEB1.CPT", "DEB2.CPT", "DEB3.CPT", "DEB4.CPT",
  "REC1CPT", "REC1.TOTAL", "REC1.EXP", "REC1.FLAP",
  "REC2.CPT", "REC2.EXP", "REC2.FLAP", "Transfer"
)
nsti_debrid[cols] <- lapply(nsti_debrid[cols], function(column) as.factor(column))

Meds Converting the features to factors

# Store the medication code and medication location as factors
cols <- c("Meds", "Med.Location")
nsti_Meds[cols] <- lapply(nsti_Meds[cols], function(column) as.factor(column))

Structuring the time data in medication

# Build the medication table (nstimeds_new) and the merged patient/debridement
# table (nsti_new) used by the timing analysis.

# Medication codes excluded from the analysis.
excluded_meds <- c("DOBU", "ENOX", "EPINEP", "NOPRESS", "STERIOD", "VASO", "WARF",
                   "ANTIBIOT", "DOP", "HEP", "IVIG", "NOREPI", "PCC", "SOFA")
# `%in%` tests every row against every excluded code. Comparing with `!=`
# against the whole vector recycles the codes down the rows, so a row is only
# dropped when it happens to line up with its own code.
nstimeds_new <- data.frame(nsti_Meds[!(nsti_Meds[["Meds"]] %in% excluded_meds), ])
#removing med location
nstimeds_new <- subset(nstimeds_new, select = -c(Med.Location))

# Drop entries whose start date is missing or a placeholder ("NOT", "UNK", "").
# Might go back and change if there is an alternative route.
start_date <- nstimeds_new[["Med.Start.Date"]]
nstimeds_new <- nstimeds_new[!is.na(start_date) & !(start_date %in% c("NOT", "UNK", "")), ]


#converting NOT to 0.00 Not a good system need to revise if we are using med location
nstimeds_new[nstimeds_new == "NOT"] <- "0:00"
nstimeds_new[nstimeds_new == ""]  <- "0:00"
nstimeds_new[nstimeds_new == "UNK"]   <- "0:00"

# removes all rows containing an NA. Need to reassess later
nstimeds_new <- nstimeds_new[complete.cases(nstimeds_new), ]
# Combine start date and start time into one timestamp. Stored as POSIXct,
# the date-time class meant for data frame columns (strptime returns POSIXlt).
nstimeds_new$totalTime <- as.POSIXct(
  strptime(paste(nstimeds_new$Med.Start.Date, nstimeds_new$Med.Start.Time, sep = " "),
           "%m/%d/%Y %H:%M")
)
# Merge the patient table with the debridement table on the patient identifier
nsti_new <- merge(nsti, nsti_debrid, by.x = "Patient", by.y = "study_ID")
# Keep only rows with a known, positive Minutes.from value.
# NOTE(review): `$Minutes.from` relies on partial matching of the column
# name; confirm the full column name and spell it out.
minutes_from <- nsti_new$Minutes.from
nsti_new <- nsti_new[!is.na(minutes_from) & minutes_from > 0, ]
#nsti_med_time <- subset(nsti_new, select = c(Patient, Admit.x, medsort,Minutes.from))

diff = hours from first medication to last medication; admittolast = minutes from admit to last deb; debtomed = hours from last deb to last med.

# For every patient and medication with more than one recorded dose, build one
# summary row:
#   diff        = hours between the first and the last dose
#   admittolast = the patient's Minutes.from value in nsti_new
#   debtomed    = hours from the last debridement to the last dose
medsdf <- data.frame(patient = integer(), med = character(), diff = character())
# One-row results are collected here and bound once after the loop instead of
# growing medsdf with rbind() on every iteration.
medrows <- list()

for (i in nsti_new$Patient) {
  temp <- nstimeds_new[which(nstimeds_new$study_ID == i), ]
  meds <- strsplit(paste(unique(temp$Meds)), " ")
  debridepatient <- nsti_debrid %>% subset(study_ID == i)
  # Time of the last debridement: start of the first debridement plus the
  # first-to-last offset, multiplied by 60 because date-times add in seconds.
  debridedate <- strptime(
    paste(debridepatient$DEBRIDE.1.START.DATE, debridepatient$DEBRIDE.1.START.TIME, sep = " "),
    "%Y-%m-%d %H:%M"
  ) + debridepatient$Minutes.from.1st.Deb.to.last * 60
  for (y in meds) {
    tempmed <- temp[temp$Meds == y, ]
    if (nrow(tempmed) > 1) {
      sortedtime <- tempmed[order(tempmed["totalTime"]), ]$totalTime
      medconv <- data.frame(med1 = head(sortedtime, 1), med2 = tail(sortedtime, 1))
      medconv$diff <- difftime(medconv$med2, medconv$med1, units = "hours")
      if (medconv$diff %>% is_empty()) {
        # Skip only this medication; `break` would also discard the patient's
        # remaining medications.
        next
      }
      medconv$med <- y
      medconv$admittolast <- nsti_new[nsti_new$Patient == i, ]$Minutes.from
      medconv$debtomed <- difftime(medconv$med2, debridedate, units = "hours")
      medconv$patient <- i

      medrows[[length(medrows) + 1L]] <-
        subset(medconv, select = c(diff, med, admittolast, patient, debtomed))
    }
  }
}

# Leave the empty medsdf in place when no patient produced a row.
if (length(medrows) > 0) {
  medsdf <- do.call(rbind, medrows)
}

Visualization towards understanding the new debridement-to-medication calculations

# Look for outliers: boxplot of each melted measure in its own free-scaled
# panel, restricted to rows below the three cut-offs
ggplot(
  data = melt(subset(subset(subset(medsdf, admittolast < 12500), diff < 500), debtomed < 250)),
  aes(x = variable, y = value)
) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free")
## Using med, patient as id variables
## Warning: attributes are not identical across measure variables; they will
## be dropped

# Histogram of admittolast, one panel per medication
ggplot(medsdf, aes(admittolast)) +
  geom_histogram() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_wrap(~ med)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of diff (first-to-last dose span), one panel per medication
ggplot(medsdf, aes(diff)) +
  geom_histogram() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_wrap(~ med)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of debtomed (last debridement to last dose), one panel per medication
ggplot(medsdf, aes(debtomed)) +
  geom_histogram() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_wrap(~ med)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Copy of medsdf restricted to rows below the three cut-offs.
# NOTE(review): the plot below uses medsdf, not `test` -- confirm which is intended.
test = medsdf %>% subset(admittolast <12500) %>% subset(diff < 500) %>% subset(debtomed < 250)

# Scatter of diff against admittolast. A point is labelled with its patient id
# when either value exceeds four times that column's IQR.
ggplot(data = medsdf, aes(y = diff, x = admittolast)) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  #facet_wrap(med~. ) +
  labs( x = "Admit to last debridement", y= "Difference from last to first med") +
  geom_text(aes(label=ifelse((admittolast>4*IQR(admittolast) | diff > 4*IQR(diff)),patient,"")), hjust=1.1)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Count the distinct medications each patient received in consecutive one-day
# windows, starting at the patient's first recorded dose.
#   medwane       : one row per patient per window
#                   (unimed = number of distinct medications,
#                    daycount = window number, meds = list of those medications)
#   medwanetreedf : one row per nsti_new patient; daycount holds the count from
#                   the patient's last window
medwane <- data.frame(patient = integer())
medwanetreedf <- data.frame(patient = nsti_new$Patient)
# Create the column up front so the per-patient assignment below has a target.
medwanetreedf$daycount <- rep(NA_integer_, nrow(medwanetreedf))
# Window rows are collected here and bound once after the loop instead of
# growing medwane with rbind() on every iteration.
window_rows <- list()

for (i in nsti_new$Patient) {
  patient_rows <- which(nstimeds_new$study_ID == i)
  if (length(patient_rows) == 0) {
    next
  }
  temp <- nstimeds_new[patient_rows, ]
  temptime <- temp[order(temp["totalTime"]), ]
  theDate <- head(temptime$totalTime, 1)
  end <- tail(temptime$totalTime, 1)
  daycount <- 0
  while (theDate <= end) {
    daycount <- daycount + 1
    nextDate <- theDate + days(1)
    # Half-open window [theDate, nextDate): a dose exactly one day after the
    # window start belongs to the next window only, not to both.
    tempmed <- temptime %>% subset(totalTime >= theDate & totalTime < nextDate)
    day_meds <- unique(tempmed$Meds)
    tempmednum <- data.frame(unimed = length(day_meds), patient = i, daycount = daycount)
    tempmednum$meds <- list(day_meds)
    # NOTE(review): this is overwritten on every window, so it ends up holding
    # the count from the patient's last window -- confirm that is intended.
    medwanetreedf$daycount[medwanetreedf$patient == i] <- length(day_meds)
    window_rows[[length(window_rows) + 1L]] <- tempmednum

    theDate <- nextDate
  }
}

# Leave the empty medwane in place when no patient had medication rows.
if (length(window_rows) > 0) {
  medwane <- do.call(rbind, window_rows)
}



# Show the first entry of the meds list column (single brackets keep it wrapped in a list).
head(medwane$meds[1])
## [[1]]
## [1] CLIN LEVO PIPE VANC
## 26 Levels:  ANTIBIOT CEFA CIPR CLIN DOBU DOP ENOX EPINEP GENT HEP ... WARF
#data tree w/ sankey diagram

#nsti_tree <- as.data.frame()




#data.tree
# Preview the per-window table built above.
head(medwane)
# medwanetrans: one row per patient, with the patient's sequence of per-window
# medication counts stored as a single list-column entry (unimed).
medwanetrans <- data.frame(patient = integer())

for (i in unique(medwane$patient)){
  temp <- medwane[medwane$patient == i,]
  tempdf <- data.frame(patient = i)
  # Wrap in list() so the whole count vector sits in one cell.
  tempdf$unimed <- list(temp$unimed)
  medwanetrans <- rbind(medwanetrans, tempdf)
}

# Path string of the form "Meds A day/<count>/<count>/.../<patient>".
medwanetrans$pathString <- paste("Meds A day",sapply(medwanetrans$unimed, paste, collapse = "/"), medwanetrans$patient, sep = "/")

# Number of windows recorded for each patient.
medwanetrans$medcount <- sapply(medwanetrans$unimed, length)
# Print the patients with at most 10 windows, then build a tree from the same subset.
# as.Node is presumably data.tree's converter reading pathString -- confirm.
medwanetrans %>% subset(medwanetrans$medcount <= 10)
testTree <- as.Node(medwanetrans %>% subset(medwanetrans$medcount <= 10))

# Print the tree; limit = 200 caps how much of it is shown.
print(testTree, limit = 200)
##                                                levelName
## 1   Meds A day                                          
## 2    ¦--3                                               
## 3    ¦   ¦--4                                           
## 4    ¦   ¦   ¦--3                                       
## 5    ¦   ¦   ¦   ¦--3                                   
## 6    ¦   ¦   ¦   ¦   ¦--102                             
## 7    ¦   ¦   ¦   ¦   °--2                               
## 8    ¦   ¦   ¦   ¦       °--129                         
## 9    ¦   ¦   ¦   °--2                                   
## 10   ¦   ¦   ¦       °--2                               
## 11   ¦   ¦   ¦           °--2                           
## 12   ¦   ¦   ¦               °--2                       
## 13   ¦   ¦   ¦                   °--2                   
## 14   ¦   ¦   ¦                       °--88              
## 15   ¦   ¦   ¦--2                                       
## 16   ¦   ¦   ¦   °--2                                   
## 17   ¦   ¦   ¦       °--3                               
## 18   ¦   ¦   ¦           °--2                           
## 19   ¦   ¦   ¦               °--2                       
## 20   ¦   ¦   ¦                   °--2                   
## 21   ¦   ¦   ¦                       °--157             
## 22   ¦   ¦   ¦--5                                       
## 23   ¦   ¦   ¦   °--4                                   
## 24   ¦   ¦   ¦       °--4                               
## 25   ¦   ¦   ¦           °--0                           
## 26   ¦   ¦   ¦               °--0                       
## 27   ¦   ¦   ¦                   °--0                   
## 28   ¦   ¦   ¦                       °--1               
## 29   ¦   ¦   ¦                           °--1           
## 30   ¦   ¦   ¦                               °--203     
## 31   ¦   ¦   °--4                                       
## 32   ¦   ¦       ¦--2                                   
## 33   ¦   ¦       ¦   °--1                               
## 34   ¦   ¦       ¦       °--1                           
## 35   ¦   ¦       ¦           °--1                       
## 36   ¦   ¦       ¦               °--1                   
## 37   ¦   ¦       ¦                   °--629             
## 38   ¦   ¦       °--4                                   
## 39   ¦   ¦           °--737                             
## 40   ¦   ¦--3                                           
## 41   ¦   ¦   ¦--3                                       
## 42   ¦   ¦   ¦   ¦--2                                   
## 43   ¦   ¦   ¦   ¦   ¦--2                               
## 44   ¦   ¦   ¦   ¦   ¦   °--1                           
## 45   ¦   ¦   ¦   ¦   ¦       °--1                       
## 46   ¦   ¦   ¦   ¦   ¦           °--121                 
## 47   ¦   ¦   ¦   ¦   ¦--66                              
## 48   ¦   ¦   ¦   ¦   °--3                               
## 49   ¦   ¦   ¦   ¦       °--2                           
## 50   ¦   ¦   ¦   ¦           °--1                       
## 51   ¦   ¦   ¦   ¦               °--684                 
## 52   ¦   ¦   ¦   ¦--0                                   
## 53   ¦   ¦   ¦   ¦   °--1                               
## 54   ¦   ¦   ¦   ¦       °--0                           
## 55   ¦   ¦   ¦   ¦           °--1                       
## 56   ¦   ¦   ¦   ¦               °--1                   
## 57   ¦   ¦   ¦   ¦                   °--1               
## 58   ¦   ¦   ¦   ¦                       °--665         
## 59   ¦   ¦   ¦   °--3                                   
## 60   ¦   ¦   ¦       °--94                              
## 61   ¦   ¦   ¦--2                                       
## 62   ¦   ¦   ¦   °--2                                   
## 63   ¦   ¦   ¦       °--122                             
## 64   ¦   ¦   ¦--5                                       
## 65   ¦   ¦   ¦   ¦--2                                   
## 66   ¦   ¦   ¦   ¦   °--4                               
## 67   ¦   ¦   ¦   ¦       °--2                           
## 68   ¦   ¦   ¦   ¦           °--2                       
## 69   ¦   ¦   ¦   ¦               °--174                 
## 70   ¦   ¦   ¦   °--4                                   
## 71   ¦   ¦   ¦       °--4                               
## 72   ¦   ¦   ¦           °--1                           
## 73   ¦   ¦   ¦               °--1                       
## 74   ¦   ¦   ¦                   °--1                   
## 75   ¦   ¦   ¦                       °--1               
## 76   ¦   ¦   ¦                           °--1           
## 77   ¦   ¦   ¦                               °--862     
## 78   ¦   ¦   °--4                                       
## 79   ¦   ¦       ¦--3                                   
## 80   ¦   ¦       ¦   °--3                               
## 81   ¦   ¦       ¦       °--2                           
## 82   ¦   ¦       ¦           °--2                       
## 83   ¦   ¦       ¦               °--2                   
## 84   ¦   ¦       ¦                   °--2               
## 85   ¦   ¦       ¦                       °--830         
## 86   ¦   ¦       °--2                                   
## 87   ¦   ¦           °--1                               
## 88   ¦   ¦               °--1                           
## 89   ¦   ¦                   °--1                       
## 90   ¦   ¦                       °--909                 
## 91   ¦   ¦--5                                           
## 92   ¦   ¦   ¦--4                                       
## 93   ¦   ¦   ¦   ¦--4                                   
## 94   ¦   ¦   ¦   ¦   ¦--4                               
## 95   ¦   ¦   ¦   ¦   ¦   °--4                           
## 96   ¦   ¦   ¦   ¦   ¦       ¦--128                     
## 97   ¦   ¦   ¦   ¦   ¦       °--4                       
## 98   ¦   ¦   ¦   ¦   ¦           °--1                   
## 99   ¦   ¦   ¦   ¦   ¦               °--659             
## 100  ¦   ¦   ¦   ¦   ¦--2                               
## 101  ¦   ¦   ¦   ¦   ¦   °--1                           
## 102  ¦   ¦   ¦   ¦   ¦       °--1                       
## 103  ¦   ¦   ¦   ¦   ¦           °--1                   
## 104  ¦   ¦   ¦   ¦   ¦               °--1               
## 105  ¦   ¦   ¦   ¦   ¦                   °--729         
## 106  ¦   ¦   ¦   ¦   °--3                               
## 107  ¦   ¦   ¦   ¦       °--3                           
## 108  ¦   ¦   ¦   ¦           °--1                       
## 109  ¦   ¦   ¦   ¦               °--1                   
## 110  ¦   ¦   ¦   ¦                   °--900             
## 111  ¦   ¦   ¦   ¦--3                                   
## 112  ¦   ¦   ¦   ¦   °--3                               
## 113  ¦   ¦   ¦   ¦       °--2                           
## 114  ¦   ¦   ¦   ¦           °--2                       
## 115  ¦   ¦   ¦   ¦               °--197                 
## 116  ¦   ¦   ¦   ¦--2                                   
## 117  ¦   ¦   ¦   ¦   °--2                               
## 118  ¦   ¦   ¦   ¦       °--2                           
## 119  ¦   ¦   ¦   ¦           °--2                       
## 120  ¦   ¦   ¦   ¦               °--1                   
## 121  ¦   ¦   ¦   ¦                   °--70              
## 122  ¦   ¦   ¦   °--5                                   
## 123  ¦   ¦   ¦       °--3                               
## 124  ¦   ¦   ¦           °--2                           
## 125  ¦   ¦   ¦               °--842                     
## 126  ¦   ¦   ¦--3                                       
## 127  ¦   ¦   ¦   °--3                                   
## 128  ¦   ¦   ¦       ¦--3                               
## 129  ¦   ¦   ¦       ¦   ¦--3                           
## 130  ¦   ¦   ¦       ¦   ¦   ¦--153                     
## 131  ¦   ¦   ¦       ¦   ¦   °--3                       
## 132  ¦   ¦   ¦       ¦   ¦       °--3                   
## 133  ¦   ¦   ¦       ¦   ¦           °--3               
## 134  ¦   ¦   ¦       ¦   ¦               °--41          
## 135  ¦   ¦   ¦       ¦   °--2                           
## 136  ¦   ¦   ¦       ¦       °--2                       
## 137  ¦   ¦   ¦       ¦           °--2                   
## 138  ¦   ¦   ¦       ¦               °--852             
## 139  ¦   ¦   ¦       °--2                               
## 140  ¦   ¦   ¦           °--2                           
## 141  ¦   ¦   ¦               °--1                       
## 142  ¦   ¦   ¦                   °--1                   
## 143  ¦   ¦   ¦                       °--1               
## 144  ¦   ¦   ¦                           °--76          
## 145  ¦   ¦   °--2                                       
## 146  ¦   ¦       °--1                                   
## 147  ¦   ¦           °--2                               
## 148  ¦   ¦               °--1                           
## 149  ¦   ¦                   °--1                       
## 150  ¦   ¦                       °--2                   
## 151  ¦   ¦                           °--687             
## 152  ¦   ¦--0                                           
## 153  ¦   ¦   ¦--5                                       
## 154  ¦   ¦   ¦   ¦--2                                   
## 155  ¦   ¦   ¦   ¦   °--2                               
## 156  ¦   ¦   ¦   ¦       °--2                           
## 157  ¦   ¦   ¦   ¦           °--3                       
## 158  ¦   ¦   ¦   ¦               °--2                   
## 159  ¦   ¦   ¦   ¦                   °--2               
## 160  ¦   ¦   ¦   ¦                       °--1           
## 161  ¦   ¦   ¦   ¦                           °--143     
## 162  ¦   ¦   ¦   ¦--4                                   
## 163  ¦   ¦   ¦   ¦   °--4                               
## 164  ¦   ¦   ¦   ¦       °--3                           
## 165  ¦   ¦   ¦   ¦           °--3                       
## 166  ¦   ¦   ¦   ¦               °--18                  
## 167  ¦   ¦   ¦   °--6                                   
## 168  ¦   ¦   ¦       °--3                               
## 169  ¦   ¦   ¦           °--3                           
## 170  ¦   ¦   ¦               °--1                       
## 171  ¦   ¦   ¦                   °--1                   
## 172  ¦   ¦   ¦                       °--1               
## 173  ¦   ¦   ¦                           °--806         
## 174  ¦   ¦   ¦--0                                       
## 175  ¦   ¦   ¦   °--0                                   
## 176  ¦   ¦   ¦       °--3                               
## 177  ¦   ¦   ¦           °--4                           
## 178  ¦   ¦   ¦               °--416                     
## 179  ¦   ¦   ¦--1                                       
## 180  ¦   ¦   ¦   °--3                                   
## 181  ¦   ¦   ¦       °--73                              
## 182  ¦   ¦   ¦--10                                      
## 183  ¦   ¦   ¦   °--3                                   
## 184  ¦   ¦   ¦       °--747                             
## 185  ¦   ¦   °--3                                       
## 186  ¦   ¦       °--4                                   
## 187  ¦   ¦           °--3                               
## 188  ¦   ¦               °--3                           
## 189  ¦   ¦                   °--1                       
## 190  ¦   ¦                       °--1                   
## 191  ¦   ¦                           °--77              
## 192  ¦   ¦--1                                           
## 193  ¦   ¦   °--2                                       
## 194  ¦   ¦--6                                           
## 195  ¦   ¦   ¦--4                                       
## 196  ¦   ¦   ¦   °--4                                   
## 197  ¦   ¦   ¦       °--4                               
## 198  ¦   ¦   ¦           °--4                           
## 199  ¦   ¦   ¦               °--3                       
## 200  ¦   ¦   ¦                   °--... 1 nodes w/ 2 sub
## 201  ¦   ¦   °--... 2 nodes w/ 8 sub                    
## 202  ¦   °--... 5 nodes w/ 35 sub                       
## 203  °--... 8 nodes w/ 726 sub
# Draw the tree while a 600x600 PNG device writing to test.png is open, then
# close the device so the file is flushed and the device does not stay open.
# NOTE(review): if plot(testTree) renders an HTML widget rather than drawing on
# the graphics device, test.png will be blank -- confirm the output.
png("test.png", width = 600, height = 600)
plot(testTree)
invisible(dev.off())
# Convert the tree to a DiagrammeR graph object and print its summary.
ToDiagrammeRGraph(testTree)
## DiagrammeR Graph // 933 nodes / 932 edges
##   -- directed / connected / DAG / simple
## 
##   NODES / type: <unused> / label: 216 vals - complete
##     -- no additional node attributes
##   EDGES / rel: <unused>                               info: `get_edge_df()`
##     -- no additional edge attributes
##   SELECTION / <none>
##   CACHE / <none>
##   STORED DFs / <none>
##   GLOBAL ATTRS / <none>                info: `get_global_graph_attr_info()`
##   GRAPH ACTIONS / <none>
##   GRAPH LOG / <2 actions> -> () -> () -> ()
# Sankey Diagram

# Show the medications stored in the third entry of the meds list column.
head(medwane$meds[[3]])
## [1] PCNG CLIN VANC LEVO
## 26 Levels:  ANTIBIOT CEFA CIPR CLIN DOBU DOP ENOX EPINEP GENT HEP ... WARF
# Placeholder horizontal Sankey diagram built from hard-coded example nodes and
# links; the result is stored in p.
p <- layout(
  plot_ly(
    type = "sankey",
    orientation = "h",
    node = list(
      label = c("A1", "A2", "B1", "B2", "C1", "C2"),
      color = rep("blue", 6),
      pad = 15,
      thickness = 20,
      line = list(color = "black", width = 0.5)
    ),
    link = list(
      source = c(0, 1, 0, 2, 3, 3),
      target = c(2, 3, 3, 4, 4, 5),
      value = c(8, 4, 2, 8, 4, 2)
    )
  ),
  title = "Basic Sankey Diagram",
  font = list(size = 10)
)
# Jittered scatter of distinct-medication count against window number
ggplot(medwane, aes(daycount, unimed)) +
  geom_jitter()

# Density of window numbers below 50
ggplot(subset(medwane, daycount < 50), aes(daycount)) +
  geom_density()

#looks nice but pointless
# Stacked bars of medication counts per window, filled by patient, legend hidden
ggplot(subset(medwane, daycount < 50), aes(daycount, unimed)) +
  geom_bar(aes(fill = patient), stat = "identity") +
  theme(legend.position = "none")

# Boxplots of unimed for rows with daycount below 30, one facet per daycount.
# group = daycount tells geom_boxplot how to split the continuous x variable,
# which avoids the "Continuous x aesthetic" warning.
ggplot(
  data = medwane %>% subset(daycount < 30),
  aes(y = unimed, x = daycount, group = daycount)
) +
  geom_boxplot() +
  scale_y_continuous(breaks = 0:10) +
  facet_grid(. ~ daycount, scales = "free")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# LOESS trend of unimed over daycount (no confidence band), legend hidden,
# with y-axis breaks at every integer from 0 to 10.
ggplot(medwane, aes(x = daycount, y = unimed)) +
  #geom_line() +
  geom_smooth(method = "loess", se = FALSE) +
  theme(legend.position = "none") +
  scale_y_continuous(breaks = 0:10)

# Build a nested structure per patient: a list keyed by medication whose
# entries are that medication's totalTime values in ascending order.
# The result is stored in the list-column nsti$medsort (nsti is used because,
# per the original note, it has the actual number of patients).
# Initialise the column with a placeholder list entry for every row.
nsti$medsort <- list("")

for (i in nsti$Patient){
  # Medication rows of nstimeds_new whose study_ID matches this patient.
  temp <- nstimeds_new[which(nstimeds_new$study_ID == i,),]
  # strsplit() returns a list with one element per distinct Meds value.
  # NOTE(review): a Meds value containing a space would be split into several
  # tokens here -- confirm the values never contain spaces.
  meds = strsplit(paste(unique(temp$Meds)), " ")
  medsdf = list()
  for (y in meds){
    tempmed <- temp[temp$Meds == y,]
    # Store this medication's totalTime values sorted ascending, keyed by name.
    medsdf[[y]] <- tempmed[order(tempmed["totalTime"]),]$totalTime
  }
  # Wrap in list() so the whole per-medication list lands in a single cell.
  nsti[nsti$Patient == i,]$medsort <- list(medsdf)
} 
#head(nsti$medsort)
#ggplot(data = medsdf, aes(x = as.numeric(row.names(medsdf)),y = hours)) + 
#  geom_step(direction = "hv")
#ggplot(data = tail(medsdf,-3), aes(x = as.numeric(row.names(tail(medsdf,-3))),y = hours)) + 
#  geom_step(direction = "hv")
# graphing all of the points from patients 

# Collect, for every patient and medication, the gaps (in hours) between
# consecutive totalTime values into one long data frame, medsdf, with the
# columns diff, med, order, admittolast and patient.
medsdf <- data.frame(order = integer(), med = character(), diff = character())

for (i in nsti_new$Patient) {
  # Medication rows of nstimeds_new whose study_ID matches this patient.
  temp <- nstimeds_new[which(nstimeds_new$study_ID == i), ]
  meds <- strsplit(paste(unique(temp$Meds)), " ")
  for (y in meds) {
    tempmed <- temp[temp$Meds == y, ]
    # A gap needs at least two time points.
    if (nrow(tempmed) > 1) {
      sortedtime <- tempmed[order(tempmed["totalTime"]), ]$totalTime
      # Pair each time point with its successor.
      medconv <- data.frame(
        med1 = head(sortedtime, -1),
        med2 = tail(sortedtime, -1)
      )
      medconv$diff <- difftime(medconv$med2, medconv$med1, units = "hours")
      # Remove rows where the difference in time is zero.
      medconv <- medconv %>% subset(diff > 0)
      if (nrow(medconv) == 0) {
        # Nothing left for this medication: skip it but keep processing the
        # patient's remaining medications. (`break` would abandon them all.)
        next
      }
      medconv$med <- rep(y)
      # Row names survive subset(), so `order` is the gap's position in the
      # unfiltered sequence.
      medconv$order <- as.numeric(row.names(medconv))
      medconv$admittolast <- nsti_new[nsti_new$Patient == i, ]$Minutes.from
      medconv$patient <- i
      medsdf <- rbind(
        medsdf,
        subset(medconv, select = c(diff, med, order, admittolast, patient))
      )
    }
  }
}
# Check whether the diff column of the medconv left over from the loop is empty.
is_empty(medconv$diff)
## [1] FALSE
# Keep rows whose gap is below 30 (diff is computed in hours) and whose
# admittolast is below 40000, then plot the VANC gaps against their order,
# coloured by admittolast.
test <- medsdf %>%
  subset(diff < 30) %>%
  subset(admittolast < 40000)
ggplot(
  data = test[test$med == "VANC", ],
  aes(x = order, y = diff, colour = admittolast)
) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change",
    title = "VANC"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# CLIN gaps against their order, coloured by admittolast.
ggplot(
  data = test[test$med == "CLIN", ],
  aes(x = order, y = diff, colour = admittolast)
) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change",
    title = "CLIN"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# PCNG gaps against their order, coloured by admittolast.
ggplot(
  data = test[test$med == "PCNG", ],
  aes(x = order, y = diff, colour = admittolast)
) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change",
    title = "PCNG"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

It seems that about half of the antibiotics don't have many data points, so it is better to eliminate those.

Little to no data available - ANTIBIOT, HEP, NOREPI, PCC, GENT

Limited amount of data - CIPR, LEVO, METR, PIPE, SOFA

A lot of data - CLIN, MERO, PCNG, VANC

# Gaps against their order for every medication in test, one facet row per
# medication, coloured by admittolast.
ggplot(data = test, aes(x = order, y = diff, colour = admittolast)) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_grid(med ~ .) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Medications with little or no data: ANTIBIOT, HEP, NOREPI, PCC, GENT.
# HEP was named in the original list but missing from the filter.
lowdatamed <- test[
  which(test$med %in% c("ANTIBIOT", "HEP", "NOREPI", "PCC", "GENT")),
]
ggplot(data = lowdatamed, aes(x = order, y = diff, colour = admittolast)) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_grid(med ~ .) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Medications with a limited amount of data: CIPR, LEVO, METR, PIPE, SOFA.
meddatamed <- test[
  which(test$med %in% c("CIPR", "LEVO", "METR", "PIPE", "SOFA")),
]

ggplot(data = meddatamed, aes(x = order, y = diff, colour = admittolast)) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_grid(med ~ .) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Medications with a lot of data: CLIN, MERO, PCNG, VANC.
highdatamed <- test[
  which(test$med %in% c("CLIN", "MERO", "PCNG", "VANC")),
]

ggplot(data = highdatamed, aes(x = order, y = diff, colour = admittolast)) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_grid(med ~ .) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Same high-data view restricted to CLIN, MERO and VANC (PCNG left out).
highdatamed <- test[
  which(test$med %in% c("CLIN", "MERO", "VANC")),
]

ggplot(data = highdatamed, aes(x = order, y = diff, colour = admittolast)) +
  geom_point() +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  facet_grid(med ~ .) +
  labs(
    colour = "Admit to last DB",
    # diff is measured in hours, so the axis label must not say "Minutes".
    y = "Hours between each medication change",
    x = "Order of medication change"
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#filteredtest <- test[which(test$med == "CLIN"| test$med == "MERO" |test$med == "VANC"|test$med == "CIPR"| test$med == "LEVO" |test$med == "METR" |test$med == "PIPE" |test$med == "SOFA"),]

#ggplot(data = filteredtest, aes(x = admittolast, fill = med)) +
#  geom_histogram(position = "fill")

#ggplot(data = filteredtest, aes(x = admittolast, fill = med)) +
#  geom_histogram(position = "stack")
#highdatamed = test[which(test$med == "CLIN"| test$med == "MERO" |test$med == "PCNG" |test$med == "VANC" ),]
#ggplot(data = test[which(test$patient == 1),], aes(x = order, y = diff, col = patient)) +
#  geom_line() +
##  facet_grid(med~. )+
#  theme(legend.position = "none") 
#CPPROC2014 %>% subset(PROC == 9670)